Source Code Analysis

Initialization

Load Data

In [1]:
import pandas as pd


# Global random seed used by the train/test split below.
seed = 77


def load_dataset(path):
    """Read a scraped-pages CSV snapshot; the first CSV column holds the row index."""
    return pd.read_csv(path, index_col=0)


df = load_dataset("./sample/sample_12_May_2020.csv")

df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4009 entries, 0 to 4008
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   _id                4009 non-null   object 
 1   html_text          3966 non-null   object 
 2   netloc             4009 non-null   object 
 3   params             0 non-null      float64
 4   path               4009 non-null   object 
 5   scheme             4009 non-null   object 
 6   scraped            4009 non-null   bool   
 7   snapshot_img_path  3966 non-null   object 
 8   status             4009 non-null   object 
 9   timestamp          4009 non-null   int64  
 10  url                4009 non-null   object 
dtypes: bool(1), float64(1), int64(1), object(8)
memory usage: 348.4+ KB

Train Test Split

In [2]:
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit

# 70/30 split; the seed keeps the partition reproducible across runs.
X_train, X_test = train_test_split(df, test_size=0.3, random_state=seed)


def extract_dv(df):
    """Dependent variable: True when the page scrape ended with status 'SUCCESS'."""
    return df['status'] == 'SUCCESS'


display(extract_dv(X_train).head())
display(extract_dv(X_test).head())
580     True
1663    True
3892    True
130     True
2946    True
Name: status, dtype: bool
2147    True
2677    True
1308    True
933     True
2975    True
Name: status, dtype: bool

Feature Engineering

In [3]:
df.head()
Out[3]:
_id html_text netloc params path scheme scraped snapshot_img_path status timestamp url
0 5e87e296e27b5dafe7a08d1f <html xmlns="http://www.w3.org/1999/xhtml" xml... us.sagepub.com NaN /en-us/nam/open-access-at-sage https True /home/jjian03/Desktop/workspace/website_qualit... SUCCESS 20200403223802 https://us.sagepub.com/en-us/nam/open-access-a...
1 5e87e296e27b5dafe7a08d20 <html xmlns="http://www.w3.org/1999/xhtml" dir... bioconductor.org NaN / http True /home/jjian03/Desktop/workspace/website_qualit... SUCCESS 20200403223804 http://bioconductor.org/
2 5e87e296e27b5dafe7a08d21 <html lang="en"><head>\n <meta charset="utf... www.r-project.org NaN / http True /home/jjian03/Desktop/workspace/website_qualit... SUCCESS 20200403223806 http://www.r-project.org/
3 5e87e296e27b5dafe7a08d22 <html class="dj_quirks dj_webkit dj_chrome dj_... earray.chem.agilent.com NaN /earray/ https True /home/jjian03/Desktop/workspace/website_qualit... SUCCESS 20200403223826 https://earray.chem.agilent.com/earray/
4 5e87e296e27b5dafe7a08d23 <html lang="en-US" class="js csstransforms css... www.partek.com NaN / http True /home/jjian03/Desktop/workspace/website_qualit... SUCCESS 20200403223830 http://www.partek.com/?q=partekgs
In [4]:
def print_uniqueValue(df):
    """Display, per column, how many distinct values it holds, plus the row total."""
    summary = pd.DataFrame()
    for column in df.columns:
        # len(unique()) counts NaN as a value, matching the intent of a raw
        # cardinality check.
        summary[column] = [len(df[column].unique())]

    summary['total'] = [len(df)]
    summary.index = ['unique count']
    display(summary.T)


print_uniqueValue(df)
unique count
_id 4009
html_text 3296
netloc 2262
params 1
path 3109
scheme 3
scraped 1
snapshot_img_path 3967
status 2
timestamp 1271
url 4009
total 4009

Features in URL

Length of the url hierarchy

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator,TransformerMixin
pd.options.mode.chained_assignment = None


class URLLengthCounter(BaseEstimator, TransformerMixin):
    """Adds a 'url_length' column: the character count of each row's URL."""

    def __init__(self):
        pass

    def fit(self, x, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, x, y=None):
        frame = x
        # len applied element-wise is equivalent to the former _get_length helper.
        frame.loc[:, 'url_length'] = frame['url'].apply(len)
        return frame


# Smoke-test the transformer on the training split and preview the new column.
pipe = Pipeline([
    ('url_length_counter', URLLengthCounter()),
])

result = pipe.transform(X_train)

display(result[['url', 'url_length']].head(5))
url url_length
580 http://pi.shirecontent.com/PI/PDFs/Intuniv_USA... 54
1663 http://swopec.hhs.se/eijswp/abs/eijswp0162.htm 46
3892 http://jhep.sissa.it/ 21
130 http://www.gadeta.nl/ 21
2946 http://www.graphpad.com/quickcalcs/ttest1.cfm 45

Depth of the url hierarchy

In [6]:
class URLDepthCounter(BaseEstimator, TransformerMixin):
    """Adds a 'url_depth' column: how many levels deep the URL path goes.

    A single trailing '/' does not open a new level, so '/' -> 0,
    '/a/' -> 1, '/a/b.htm' -> 2.
    """

    def __init__(self):
        pass

    def fit(self, x, y=None):
        # Stateless: nothing to learn.
        return self

    def transform(self, x, y=None):
        frame = x
        frame.loc[:, 'url_depth'] = frame['path'].apply(self._get_depth)
        return frame

    def _get_depth(self, path):
        # Preserve the original behavior of raising ValueError when the path
        # contains no '/' at all (rindex did this implicitly).
        path.rindex('/')
        slashes = path.count('/')
        # The original excluded a final trailing slash from the count.
        return slashes - 1 if path.endswith('/') else slashes

# Chain both URL transformers and preview the derived depth feature.
pipe = Pipeline([
    ('url_length_counter', URLLengthCounter()),
    ('url_depth_counter', URLDepthCounter()),
])

result = pipe.transform(X_train)

display(result[['path', 'url_depth']].head(5))
path url_depth
580 /PI/PDFs/Intuniv_USA_ENG.pdf 3
1663 /eijswp/abs/eijswp0162.htm 3
3892 / 0
130 / 0
2946 /quickcalcs/ttest1.cfm 2

Has WWW subdomain

In [7]:
class HasWWWConverter(BaseEstimator, TransformerMixin):
    """Adds a binary 'has_www' column: 1 when the host starts with 'www.'."""

    def __init__(self):
        pass

    def fit(self, x, y=None):
        # Stateless: nothing to learn.
        return self

    def transform(self, x, y=None):
        frame = x
        frame.loc[:, 'has_www'] = frame['netloc'].apply(self._has_www)
        return frame

    def _has_www(self, domain):
        # Explicit 0/1 keeps the feature numeric for downstream models.
        return 1 if domain.startswith('www.') else 0


# Extend the pipeline with the www-subdomain indicator and preview it.
pipe = Pipeline([
    ('url_length_counter', URLLengthCounter()),
    ('url_depth_counter', URLDepthCounter()),
    ('has_www_converter', HasWWWConverter()),
])

result = pipe.transform(X_train)

display(result[['netloc', 'has_www']].head(5))
netloc has_www
580 pi.shirecontent.com 0
1663 swopec.hhs.se 0
3892 jhep.sissa.it 0
130 www.gadeta.nl 1
2946 www.graphpad.com 1

Level of the Subdomain

In [8]:
class SubdomainLevelCounter(BaseEstimator, TransformerMixin):
    """Adds a 'subdomain_level' column: dot count of the host name.

    The dot count approximates the nesting level ('www.example.com' -> 2).
    """

    def __init__(self):
        pass

    def fit(self, x, y=None):
        # Stateless: nothing to learn.
        return self

    def transform(self, x, y=None):
        frame = x
        frame.loc[:, 'subdomain_level'] = frame['netloc'].apply(self._get_level)
        return frame

    def _get_level(self, domain):
        return domain.count('.')


# Extend the pipeline with the subdomain-level counter and preview it.
pipe = Pipeline([
    ('url_length_counter', URLLengthCounter()),
    ('url_depth_counter', URLDepthCounter()),
    ('has_www_converter', HasWWWConverter()),
    ('subdomain_level_counter', SubdomainLevelCounter()),
])

result = pipe.transform(X_train)

display(result[['netloc', 'subdomain_level']].head(5))
netloc subdomain_level
580 pi.shirecontent.com 2
1663 swopec.hhs.se 2
3892 jhep.sissa.it 2
130 www.gadeta.nl 2
2946 www.graphpad.com 2

Number of HTTP-Get parameters

In [9]:
import numpy as np


class RequestParameterCounter(BaseEstimator, TransformerMixin):
    """Adds a 'param_cnt' column: the number of HTTP GET parameters.

    NaN in 'params' (no query string) is first normalized to '' and counted
    as zero; otherwise parameters are '&'-separated, so the count is the
    number of '&' plus one.
    """

    def __init__(self):
        pass

    def fit(self, x, y=None):
        # Stateless: nothing to learn.
        return self

    def transform(self, x, y=None):
        result = x
        # Use .loc for the in-place column update to avoid chained-assignment
        # warnings when `x` is a slice of another frame.
        result.loc[:, 'params'] = result['params'].replace(np.nan, '', regex=True)
        result.loc[:, 'param_cnt'] = result['params'].apply(self._count_param)
        return result

    def _count_param(self, params):
        # BUG FIX: the original used `params is ''`, an identity comparison
        # that only worked when CPython happened to intern the empty string
        # (and emits a SyntaxWarning on Python 3.8+). Compare by value.
        if params == '':
            return 0
        return params.count('&') + 1

# Extend the pipeline with the GET-parameter counter; in this sample 'params'
# is entirely NaN, so every count is 0.
pipe = Pipeline([
    ('url_length_counter', URLLengthCounter()),
    ('url_depth_counter', URLDepthCounter()),
    ('has_www_converter', HasWWWConverter()),
    ('subdomain_level_counter', SubdomainLevelCounter()),
    ('request_parameter_counter', RequestParameterCounter()),
])

result = pipe.transform(X_train)

display(result[['params', 'param_cnt']].head(5))
params param_cnt
580 0
1663 0
3892 0
130 0
2946 0

Domain Suffix

In [10]:
# NOTE(review): prefer `%pip install feature_engine==0.4.2` so the install
# targets the running kernel's environment and the version is pinned.
!pip3 install feature_engine
Requirement already satisfied: feature_engine in /Applications/anaconda3/lib/python3.7/site-packages (0.4.2)
Requirement already satisfied: pandas<1.1.0,>=1.0.3 in /Applications/anaconda3/lib/python3.7/site-packages (from feature_engine) (1.0.3)
Requirement already satisfied: scipy<1.5.0,>=1.4.1 in /Applications/anaconda3/lib/python3.7/site-packages (from feature_engine) (1.4.1)
Requirement already satisfied: statsmodels<0.12.0,>=0.11.1 in /Applications/anaconda3/lib/python3.7/site-packages (from feature_engine) (0.11.1)
Requirement already satisfied: scikit-learn<0.23.0,>=0.22.2 in /Applications/anaconda3/lib/python3.7/site-packages (from feature_engine) (0.22.2.post1)
Requirement already satisfied: numpy<1.19.0,>=1.18.2 in /Applications/anaconda3/lib/python3.7/site-packages (from feature_engine) (1.18.4)
Requirement already satisfied: python-dateutil>=2.6.1 in /Applications/anaconda3/lib/python3.7/site-packages (from pandas<1.1.0,>=1.0.3->feature_engine) (2.8.0)
Requirement already satisfied: pytz>=2017.2 in /Applications/anaconda3/lib/python3.7/site-packages (from pandas<1.1.0,>=1.0.3->feature_engine) (2019.3)
Requirement already satisfied: patsy>=0.5 in /Applications/anaconda3/lib/python3.7/site-packages (from statsmodels<0.12.0,>=0.11.1->feature_engine) (0.5.1)
Requirement already satisfied: joblib>=0.11 in /Applications/anaconda3/lib/python3.7/site-packages (from scikit-learn<0.23.0,>=0.22.2->feature_engine) (0.13.2)
Requirement already satisfied: six>=1.5 in /Applications/anaconda3/lib/python3.7/site-packages (from python-dateutil>=2.6.1->pandas<1.1.0,>=1.0.3->feature_engine) (1.12.0)
In [11]:
from feature_engine import categorical_encoders


class DomainSuffixBuilder(BaseEstimator, TransformerMixin):
    """Derives domain-suffix (TLD) features from 'netloc'.

    Adds three columns:
      - 'suffix':         frequency encoding of the raw suffix,
      - 'is_port_access': 1 when the suffix carries an explicit ':port',
      - 'suffix_idx':     the suffix with any ':port' part stripped.
    Rows whose netloc contains no '.' are dropped as malformed.
    """

    def __init__(self):
        # NOTE(review): never read anywhere; the fitted mapping lives in
        # self._suffix_dict instead — presumably leftover from a refactor.
        self._suffix_mapping = None

    def fit(self,x,y=None):
        # No-op: all fitting happens inside transform() (see note below).
        return self
        
    def transform(self,x,y=None):
        # NOTE(review): the frequency encoder is fit on the incoming frame
        # inside transform(), so train and test receive different encodings
        # (inconsistent features / leakage) — confirm whether intentional.
        result = x
        # Remove incorrect urls
        result = result[result['netloc'].apply(lambda x: '.' in x)]
        result.loc[:, 'suffix'] = result.netloc.apply(DomainSuffixBuilder._get_url_suffix)
        result.loc[:, 'is_port_access'] = result.suffix.apply(DomainSuffixBuilder._is_port_access)
        result.loc[:, 'suffix_idx'] = result.suffix.apply(DomainSuffixBuilder._clean_url_suffix)
        encoder = categorical_encoders.CountFrequencyCategoricalEncoder(
            encoding_method='frequency',
            variables=['suffix'])
        # Replaces the 'suffix' strings with their relative frequencies.
        result = encoder.fit_transform(result)
        # Keep the fitted suffix -> frequency mapping for later inspection.
        self._suffix_dict = encoder.encoder_dict_['suffix']
        return result

    @property
    def suffix_dict(self):
        # Only set after transform() has run at least once.
        return self._suffix_dict

    @staticmethod
    def _get_url_suffix(url):
        # Everything after the last '.', e.g. 'example.com:8080' -> 'com:8080'.
        last_idx = url.rindex('.')
        return url[last_idx + 1:]

    @staticmethod
    def _clean_url_suffix(url):
        # Drop an explicit port: 'com:8080' -> 'com'.
        return url.split(':')[0]

    @staticmethod
    def _is_port_access(suffix):
        # 1 when the suffix contains a non-empty component after a ':'.
        return int(len([token for token in suffix.split(':') if token.strip() != ''])>1)


# Extend the pipeline with the suffix builder. Note that after this step the
# 'suffix' column already holds frequency encodings, not the raw TLD strings.
pipe = Pipeline([
    ('url_length_counter', URLLengthCounter()),
    ('url_depth_counter', URLDepthCounter()),
    ('has_www_converter', HasWWWConverter()),
    ('subdomain_level_counter', SubdomainLevelCounter()),
    ('request_parameter_counter', RequestParameterCounter()),
    ('domain_suffix_builder', DomainSuffixBuilder()),
])

result = pipe.transform(X_train)


display(result[['netloc', 'is_port_access', 'suffix', 'suffix_idx']].head(5))
pipe.steps[-1][1].suffix_dict
netloc is_port_access suffix suffix_idx
580 pi.shirecontent.com 0 0.167500 com
1663 swopec.hhs.se 0 0.005000 se
3892 jhep.sissa.it 0 0.007500 it
130 www.gadeta.nl 0 0.006786 nl
2946 www.graphpad.com 0 0.167500 com
Out[11]:
{'org': 0.2725,
 'com': 0.1675,
 'gov': 0.15892857142857142,
 'edu': 0.07392857142857143,
 'uk': 0.0475,
 'jp': 0.0375,
 'net': 0.026785714285714284,
 'int': 0.02035714285714286,
 'eu': 0.02035714285714286,
 'de': 0.01892857142857143,
 'au': 0.015714285714285715,
 'cn': 0.014285714285714285,
 'fr': 0.013214285714285715,
 'ca': 0.012857142857142857,
 'it': 0.0075,
 'be': 0.007142857142857143,
 'nl': 0.0067857142857142855,
 'br': 0.006071428571428571,
 'ch': 0.005,
 'se': 0.005,
 'dk': 0.004285714285714286,
 'tw': 0.004285714285714286,
 'at': 0.003928571428571429,
 'ae': 0.0032142857142857142,
 'pl': 0.0032142857142857142,
 'in': 0.002857142857142857,
 'info': 0.002857142857142857,
 'ie': 0.0025,
 'es': 0.0025,
 'no': 0.0017857142857142857,
 'us': 0.0014285714285714286,
 'ru': 0.0014285714285714286,
 'nz': 0.0014285714285714286,
 'tr': 0.0010714285714285715,
 'kr': 0.0010714285714285715,
 'mil': 0.0010714285714285715,
 'sa': 0.0010714285714285715,
 'io': 0.0010714285714285715,
 'gl': 0.0010714285714285715,
 'sg': 0.0010714285714285715,
 'il': 0.0010714285714285715,
 'si': 0.0010714285714285715,
 'hk': 0.0010714285714285715,
 'om': 0.0007142857142857143,
 'ee': 0.0007142857142857143,
 '': 0.0007142857142857143,
 'za': 0.0007142857142857143,
 'my': 0.0007142857142857143,
 'fi': 0.0007142857142857143,
 'gr': 0.0007142857142857143,
 'ws': 0.00035714285714285714,
 'sc': 0.00035714285714285714,
 '95': 0.00035714285714285714,
 'com:3838': 0.00035714285714285714,
 '7624': 0.00035714285714285714,
 'en': 0.00035714285714285714,
 '26': 0.00035714285714285714,
 '186': 0.00035714285714285714,
 'is': 0.00035714285714285714,
 'ua': 0.00035714285714285714,
 'org44': 0.00035714285714285714,
 'edu:8090': 0.00035714285714285714,
 'cu': 0.00035714285714285714,
 'cl': 0.00035714285714285714,
 'pt': 0.00035714285714285714,
 'zw': 0.00035714285714285714,
 'edu:8080': 0.00035714285714285714,
 'pa': 0.00035714285714285714,
 'cz': 0.00035714285714285714,
 'md': 0.00035714285714285714,
 'qa': 0.00035714285714285714,
 '106': 0.00035714285714285714,
 'nlm': 0.00035714285714285714,
 '22': 0.00035714285714285714,
 'kw': 0.00035714285714285714,
 'mlstransposonet': 0.00035714285714285714,
 'mutations': 0.00035714285714285714,
 'ncbi': 0.00035714285714285714,
 'ar': 0.00035714285714285714,
 '90:8080': 0.00035714285714285714}

Remove the Incorrect Domains

In [12]:
import re


class IncorrectDomainUrlCleaner(BaseEstimator, TransformerMixin):
    """Drops rows whose cleaned suffix ('suffix_idx') is not a plausible TLD."""

    def __init__(self):
        # TLD labels are 2-63 ASCII letters. (re.I is redundant given the
        # explicit a-zA-Z class, but harmless.)
        self._regex = re.compile(r'^[a-zA-Z]{2,63}$', re.I)

    def fit(self, x, y=None):
        # Stateless: nothing to learn.
        return self

    def transform(self, x, y=None):
        frame = x
        frame.loc[:, 'is_correct'] = frame.suffix_idx.apply(self._is_correct)
        # Keep only valid rows, then drop the scratch column again.
        frame = frame[frame.is_correct]
        frame = frame.drop('is_correct', axis=1)
        return frame

    def _is_correct(self, domain_suffix):
        # bool() collapses the Match-object / None result into True / False.
        return bool(self._regex.match(domain_suffix))


# Run the full cleaning pipeline and report how many rows were discarded.
pipe = Pipeline([
    ('url_length_counter', URLLengthCounter()),
    ('url_depth_counter', URLDepthCounter()),
    ('has_www_converter', HasWWWConverter()),
    ('subdomain_level_counter', SubdomainLevelCounter()),
    ('request_parameter_counter', RequestParameterCounter()),
    ('domain_suffix_builder', DomainSuffixBuilder()),
    ('incorrect_domain_url_cleaner', IncorrectDomainUrlCleaner()),
])

result = pipe.transform(X_train)

print(f'Before changes: {len(X_train)}')
print(f'After changes: {len(result)}')
Before changes: 2806
After changes: 2790

Protocol Type Conversion

In [13]:
from feature_engine import categorical_encoders


class ColumnRenamer(BaseEstimator, TransformerMixin):
    """Renames columns according to an {old_name: new_name} mapping.

    Mapping entries whose key is absent from the frame are pruned, and the
    pruned mapping is stored back on the instance (matching the original
    side effect).
    """

    def __init__(self, mapping):
        self._mapping = mapping

    @property
    def mapping(self):
        return self._mapping

    def fit(self, x, y=None):
        # Stateless: nothing to learn.
        return self

    def transform(self, x, y=None):
        frame = x
        # Keep only entries that refer to columns actually present.
        self._mapping = {old: new for old, new in self._mapping.items() if old in frame.columns}
        frame = frame.rename(columns=self._mapping)
        return frame

# Rename 'scheme' to the more descriptive 'protocol_type' and preview it.
pipe = Pipeline([
    ('url_length_counter', URLLengthCounter()),
    ('url_depth_counter', URLDepthCounter()),
    ('has_www_converter', HasWWWConverter()),
    ('subdomain_level_counter', SubdomainLevelCounter()),
    ('request_parameter_counter', RequestParameterCounter()),
    ('domain_suffix_builder', DomainSuffixBuilder()),
    ('incorrect_domain_url_cleaner', IncorrectDomainUrlCleaner()),
    ('column_renamer', ColumnRenamer({'scheme': 'protocol_type'})),
])

result = pipe.transform(X_train)

display(result[['url', 'protocol_type']].head(5))

Features in source code

In [ ]:
 

Code length(kb)

Text Mining

type of Javascript framework
No of JS files
Language schema
HTML version
iFrame in Body
Remove tags, Tf-Idf Score of Body
Tf-Idf Score of Header

EDA

In [14]:
# Post-pipeline sanity check: cardinalities and dtypes of the derived frame.
print_uniqueValue(result)
result.info()
unique count
_id 2790
html_text 2364
netloc 1681
params 1
path 2195
protocol_type 3
scraped 1
snapshot_img_path 2765
status 2
timestamp 1116
url 2790
url_length 142
url_depth 11
has_www 2
subdomain_level 7
param_cnt 1
suffix 28
is_port_access 2
suffix_idx 68
total 2790
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2790 entries, 580 to 2775
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   _id                2790 non-null   object 
 1   html_text          2764 non-null   object 
 2   netloc             2790 non-null   object 
 3   params             2790 non-null   object 
 4   path               2790 non-null   object 
 5   protocol_type      2790 non-null   object 
 6   scraped            2790 non-null   bool   
 7   snapshot_img_path  2764 non-null   object 
 8   status             2790 non-null   object 
 9   timestamp          2790 non-null   int64  
 10  url                2790 non-null   object 
 11  url_length         2790 non-null   int64  
 12  url_depth          2790 non-null   int64  
 13  has_www            2790 non-null   int64  
 14  subdomain_level    2790 non-null   int64  
 15  param_cnt          2790 non-null   int64  
 16  suffix             2790 non-null   float64
 17  is_port_access     2790 non-null   int64  
 18  suffix_idx         2790 non-null   object 
dtypes: bool(1), float64(1), int64(7), object(10)
memory usage: 416.9+ KB
In [15]:
import plotly

import plotly.graph_objects as go
from plotly.subplots import make_subplots


# Rebuild the full feature pipeline, then plot per-feature distributions.
pipe = Pipeline([
    ('url_length_counter', URLLengthCounter()),
    ('url_depth_counter', URLDepthCounter()),
    ('has_www_converter', HasWWWConverter()),
    ('subdomain_level_counter', SubdomainLevelCounter()),
    ('request_parameter_counter', RequestParameterCounter()),
    ('domain_suffix_builder', DomainSuffixBuilder()),
    ('incorrect_domain_url_cleaner', IncorrectDomainUrlCleaner()),
    ('column_renamer', ColumnRenamer({'scheme': 'protocol_type'})),
])

result = pipe.transform(X_train)

non_binary_result = result[['protocol_type', 'url_length', 'url_depth', 'subdomain_level', 'param_cnt', 'suffix_idx']]

def plot_distribution(data, title):
    """Return a Plotly figure with one histogram subplot per column of `data`."""
    fig = make_subplots(rows=len(data.columns), cols=1,
                    subplot_titles=data.columns)

    for idx, col_name in enumerate(data.columns):
        # Plotly subplot rows are 1-indexed.
        fig.add_trace(go.Histogram(x=data[col_name], name=col_name), row=idx + 1, col=1)


    fig.update_layout(height=1200, width=800, title_text=title)
    return fig

plot_distribution(non_binary_result, "Non Binary Features Distribution")
In [16]:
# Same distribution view for the binary features (plus the target 'status').
binary_result = result[['status', 'has_www', 'is_port_access']]

plot_distribution(binary_result, "Binary Features Distribution")

Most of the non-binary features are right-skewed, so it is necessary to apply a standard scaler in a later step.

Modeling

Data Cleaning

Convert Timestamp to the Inversed Number of Month From the Available Time

Timestamp will not be used in the logistic regression because it is directly correlated with the dependent variable.

In [17]:
import time
import datetime


class TimeseriesConverter(BaseEstimator, TransformerMixin):
    """Parses 'timestamp' into datetimes and adds 'timestamp_coef'.

    'timestamp_coef' is 1 / (whole days between the reference scrape time,
    2020-05-13 13:20:15, and the row's timestamp), so more recent snapshots
    get a larger coefficient.
    """

    def __init__(self):
        # Fixed reference point: when this analysis snapshot was taken.
        self._scraped_dt = datetime.datetime.strptime('20200513132015', "%Y%m%d%H%M%S")

    def fit(self, x, y=None):
        # Stateless: nothing to learn.
        return self

    def transform(self, x, y=None):
        frame = x
        # Integer stamps such as 20200403223802 -> str -> datetime.
        frame.loc[:, 'timestamp'] = frame.timestamp.astype(str)
        frame.loc[:, 'timestamp'] = frame.timestamp.apply(
            lambda ts: datetime.datetime.strptime(ts, "%Y%m%d%H%M%S"))
        # Inverse age in whole days relative to the reference time.
        frame.loc[:, 'timestamp_coef'] = 1 / (self._scraped_dt - frame.timestamp).apply(lambda delta: delta.days)
        return frame

# Attach the timestamp converter and preview the resulting coefficient.
pipe = Pipeline([
    ('url_length_counter', URLLengthCounter()),
    ('url_depth_counter', URLDepthCounter()),
    ('has_www_converter', HasWWWConverter()),
    ('subdomain_level_counter', SubdomainLevelCounter()),
    ('request_parameter_counter', RequestParameterCounter()),
    ('domain_suffix_builder', DomainSuffixBuilder()),
    ('incorrect_domain_url_cleaner', IncorrectDomainUrlCleaner()),
    ('column_renamer', ColumnRenamer({'scheme': 'protocol_type'})),
    ('timeseries_converter', TimeseriesConverter()),
])

result = pipe.transform(X_train)

result.timestamp_coef.head()
Out[17]:
580     0.025641
1663    0.025641
3892    0.025641
130     0.025641
2946    0.025641
Name: timestamp_coef, dtype: float64

Remove redundant features

In [18]:
class FeatureRemover(BaseEstimator, TransformerMixin):
    """Drops the requested columns, silently ignoring any that are absent."""

    def __init__(self, features):
        self._removed_features = None
        self._features = features

    @property
    def removed_features(self):
        # Columns actually dropped by the most recent transform() call.
        return self._removed_features

    def fit(self, x, y=None):
        # Stateless: nothing to learn.
        return self

    def transform(self, x, y=None):
        frame = x
        self._removed_features = [col for col in self._features if col in frame.columns]
        return frame.drop(self._removed_features, axis=1)


class FeaturePicker(BaseEstimator, TransformerMixin):
    """Keeps only the requested columns, silently ignoring any that are absent."""

    def __init__(self, features):
        self._picked_features = None
        self._features = features

    @property
    def picked_features(self):
        # Columns actually kept by the most recent transform() call.
        return self._picked_features

    def fit(self, x, y=None):
        # Stateless: nothing to learn.
        return self

    def transform(self, x, y=None):
        frame = x
        self._picked_features = [col for col in self._features if col in frame.columns]
        return frame[self._picked_features]


# Keep only the modeling features ('timestamp_coef' is listed but absent here
# because the timeseries step is commented out, so the picker skips it).
pipe = Pipeline([
    ('url_length_counter', URLLengthCounter()),
    ('url_depth_counter', URLDepthCounter()),
    ('has_www_converter', HasWWWConverter()),
    ('subdomain_level_counter', SubdomainLevelCounter()),
    ('request_parameter_counter', RequestParameterCounter()),
    ('domain_suffix_builder', DomainSuffixBuilder()),
    ('incorrect_domain_url_cleaner', IncorrectDomainUrlCleaner()),
    ('column_renamer', ColumnRenamer({'scheme': 'protocol_type'})),
#     ('timeseries_converter', TimeseriesConverter()),
    ('feature_picker', FeaturePicker(['protocol_type',
                                        'url_depth',
                                        'has_www',
                                        'subdomain_level',
                                        'param_cnt',
                                        'suffix',
                                        'timestamp_coef',
                                        'is_port_access',
                                        'status',
                                       ])),
])

result = pipe.transform(X_train)

result.columns
Out[18]:
Index(['protocol_type', 'url_depth', 'has_www', 'subdomain_level', 'param_cnt',
       'suffix', 'is_port_access', 'status'],
      dtype='object')

Miscellaneous Clean Up

  • Standardize variance
  • Convert Categorical Features into a Frequency-Based Numerical Index
  • Remove low variance features
In [19]:
from itertools import compress
from sklearn import feature_selection


class LowVarianceRemover(BaseEstimator, TransformerMixin):
    """Drops near-constant feature columns.

    Columns with exactly two unique values are treated as Bernoulli features
    and compared against the variance bound p*(1-p); all other columns are
    compared against the raw threshold p. The dependent variable 'status' is
    always kept.
    """

    def __init__(self, threshold):
        self._p = threshold
        self._bi_vt = feature_selection.VarianceThreshold(threshold=threshold*(1-threshold))
        self._regular_vt = feature_selection.VarianceThreshold(threshold=threshold)
        self._dropped_columns = list()

    @property
    def threshold(self):
        # BUG FIX: the original returned self._threshold, an attribute that
        # was never assigned, so reading this property raised AttributeError.
        return self._p

    @property
    def dropped_columns(self):
        # Columns removed by the most recent transform() call.
        return self._dropped_columns

    def fit(self,x,y=None):
        # NOTE(review): the variance thresholds are estimated inside
        # transform() rather than here, so each call re-fits on its input.
        return self

    def transform(self,x,y=None):
        result = x

        # BUG FIX: reset so repeated transform() calls do not accumulate
        # duplicate column names (the original only ever appended).
        self._dropped_columns = list()

        # Unique-value counts per column; the dependent variable is exempt.
        df_unique = pd.DataFrame()
        for col_name in result.columns:
            if 'status' != col_name:
                df_unique[col_name] = [len(result[col_name].unique())]

        df_unique.index = ['unique count']
        df_unique = df_unique.T.squeeze()

        # Split into binary (exactly two values) and regular columns.
        bi_columns = df_unique[df_unique == 2].index.tolist()
        regular_columns = df_unique[df_unique != 2].index.tolist()

        if len(bi_columns) > 0:
            self._bi_vt.fit(result[bi_columns])
            bi_mask = self._bi_vt.variances_ < self._p * (1 - self._p)
            self._dropped_columns = self._dropped_columns + list(compress(bi_columns, bi_mask))
        if len(regular_columns) > 0:
            self._regular_vt.fit(result[regular_columns])
            regular_mask = self._regular_vt.variances_ < self._p
            self._dropped_columns = self._dropped_columns + list(compress(regular_columns, regular_mask))

        if len(self._dropped_columns) > 0:
            remover = FeatureRemover(self._dropped_columns)
            result = remover.transform(result)
        return result


# Full pipeline: features -> frequency-encode 'protocol_type' -> drop
# near-constant columns; then report what was removed.
pipe = Pipeline([
    ('url_length_counter', URLLengthCounter()),
    ('url_depth_counter', URLDepthCounter()),
    ('has_www_converter', HasWWWConverter()),
    ('subdomain_level_counter', SubdomainLevelCounter()),
    ('request_parameter_counter', RequestParameterCounter()),
    ('domain_suffix_builder', DomainSuffixBuilder()),
    ('incorrect_domain_url_cleaner', IncorrectDomainUrlCleaner()),
    ('column_renamer', ColumnRenamer({'scheme': 'protocol_type'})),
#     ('timeseries_converter', TimeseriesConverter()),
    ('feature_picker', FeaturePicker(['protocol_type',
                                        'url_depth',
                                        'has_www',
                                        'subdomain_level',
                                        'param_cnt',
                                        'suffix',
                                        'timestamp_coef',
                                        'is_port_access',
                                        'status',
                                       ])),
    ('frequency_indexer', categorical_encoders.CountFrequencyCategoricalEncoder(
        encoding_method='frequency',
        variables=['protocol_type'])),
    ('low_variance_remover', LowVarianceRemover(0.01))

])


result = pipe.fit_transform(X_train)


print(f'Before transform: {X_train.columns}\n')
print(f'After transform: {result.columns}\n')
print(f'Dropped columns: {pipe.steps[-1][1].dropped_columns}')
Before transform: Index(['_id', 'html_text', 'netloc', 'params', 'path', 'scheme', 'scraped',
       'snapshot_img_path', 'status', 'timestamp', 'url', 'url_length',
       'url_depth', 'has_www', 'subdomain_level', 'param_cnt'],
      dtype='object')

After transform: Index(['protocol_type', 'url_depth', 'has_www', 'subdomain_level', 'suffix',
       'status'],
      dtype='object')

Dropped columns: ['is_port_access', 'param_cnt']

The port indicator was dropped, but I believe it could help explain the availability of the URL resource, so I will later build a separate subset to analyze that aspect.

Add Sklearn Build-in Function

In [20]:
from sklearn import preprocessing


class CustomizedStandardizer(BaseEstimator, TransformerMixin):
    """L2-normalizes then standard-scales every feature except 'status'.

    transform() returns a numpy array whose last column is the binarized
    dependent variable (1 for 'SUCCESS', else 0). The matching column order
    is exposed through the `columns` property after transform() has run.
    """

    def __init__(self, norm='l2'):
        self._pipe = Pipeline([
            ('normalizer', preprocessing.Normalizer(norm=norm, copy=True)),
            ('standard_scaler', preprocessing.StandardScaler()),

        ])
        self._columns = None

    @property
    def columns(self):
        # Column names of the array produced by transform() ('status' last).
        return self._columns

    def fit(self, x, y=None):
        # NOTE: the internal pipeline is (re)fit inside transform().
        return self

    def transform(self, x, y=None):
        features = x.drop('status', axis=1)
        self._columns = [*features.columns, 'status']
        scaled = self._pipe.fit_transform(features)
        # Binarize the target and append it as the final column.
        labels = x.status.apply(lambda v: 1 if 'SUCCESS' == v else 0).tolist()
        labels = np.array([labels]).T
        return np.append(scaled, labels, axis=1)

# Full modeling pipeline ending in normalization + scaling; the output array
# is wrapped back into a DataFrame using the standardizer's column order.
pipe = Pipeline([
    ('url_length_counter', URLLengthCounter()),
    ('url_depth_counter', URLDepthCounter()),
    ('has_www_converter', HasWWWConverter()),
    ('subdomain_level_counter', SubdomainLevelCounter()),
    ('request_parameter_counter', RequestParameterCounter()),
    ('domain_suffix_builder', DomainSuffixBuilder()),
    ('incorrect_domain_url_cleaner', IncorrectDomainUrlCleaner()),
    ('column_renamer', ColumnRenamer({'scheme': 'protocol_type'})),
#     ('timeseries_converter', TimeseriesConverter()),
    ('feature_picker', FeaturePicker(['protocol_type',
                                        'url_depth',
                                        'has_www',
                                        'subdomain_level',
                                        'param_cnt',
                                        'suffix',
                                        'timestamp_coef',
                                        'is_port_access',
                                        'status'
                                       ])),
    ('frequency_indexer', categorical_encoders.CountFrequencyCategoricalEncoder(
        encoding_method='frequency',
        variables=['protocol_type'])),
    ('low_variance_remover', LowVarianceRemover(0.01)),
    ('standard_scaler', CustomizedStandardizer(norm='l2')),

])

result = pipe.fit_transform(X_train)

result = pd.DataFrame(result, columns= pipe.steps[-1][1].columns)

plot_distribution(result, "Standardized Features Distribution")

Logistic Regression

In [21]:
import gc
import multiprocessing
import warnings

# Silence library deprecation chatter for the modeling cells below.
warnings.filterwarnings("ignore")

# Use every available core for the grid searches.
cpu_cnt = multiprocessing.cpu_count()
allocated_cpu = cpu_cnt
print(f"Allocated {allocated_cpu} CPUs")
gc.collect()
Allocated 16 CPUs
Out[21]:
7709
In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.naive_bayes import GaussianNB


class AnalysisEngineBuilder:
    """Fluent builder for a grid-searched classification experiment."""

    def __init__(self):
        # BUG FIX: the original method was named `__init_` (missing trailing
        # underscore), so Python never called it and these attributes were
        # only ever created by the setters; build() on a fresh builder with
        # missing setters raised AttributeError instead of passing None.
        self._X_train = None
        self._y_train = None
        self._X_test = None
        self._y_test = None
        self._param_grid = None
        self._engine = None

    def set_X_train(self, X_train):
        self._X_train = X_train
        return self

    def set_y_train(self, y_train):
        self._y_train = y_train
        return self

    def set_X_test(self, X_test):
        self._X_test = X_test
        return self

    def set_y_test(self, y_test):
        self._y_test = y_test
        return self

    def set_param_grid(self, param_grid):
        self._param_grid = param_grid
        return self

    def set_engine(self, engine):
        self._engine = engine
        return self

    def build(self):
        """Materialize an engine from whatever has been configured so far."""
        return AnalysisEngineBuilder._AnalysisEngine(self._X_train, self._y_train, self._X_test, self._y_test, self._param_grid, self._engine)

    class _AnalysisEngine:
        """Runs a 10-fold GridSearchCV and records test-set metrics."""

        def __init__(self, X_train, y_train, X_test, y_test, param_grid, engine):
            self._X_train = X_train
            self._y_train = y_train
            self._X_test = X_test
            self._y_test = y_test
            self._param_grid = param_grid
            self._engine = engine
            self._grid = GridSearchCV(self._engine, self._param_grid, cv=10, scoring='accuracy')
            self._pred = None
            self._pred_prob = None
            self._accuracy = None
            self._roc = None
            self._tpr = None
            self._fpr = None
            # BUG FIX: initialize so the `threshold` property cannot raise
            # AttributeError before analyze() has been called.
            self._threshold = None

        @property
        def grid_search_result(self):
            # Cross-validation results of the fitted grid search.
            return pd.DataFrame(self._grid.cv_results_)

        @property
        def accuracy(self):
            return self._accuracy

        @property
        def roc(self):
            return self._roc

        @property
        def tpr(self):
            return self._tpr

        @property
        def fpr(self):
            return self._fpr

        @property
        def threshold(self):
            return self._threshold

        def analyze(self):
            """Fit the grid search, then compute ROC and accuracy on the test set."""
            self._grid.fit(self._X_train, self._y_train)
            self._pred = self._grid.predict(self._X_test)
            # Label-based curve as a fallback; replaced below when the
            # estimator supports probability predictions.
            self._fpr, self._tpr, self._threshold = roc_curve(self._y_test, self._pred)
            try:
                self._pred_prob = self._grid.predict_proba(self._X_test)
                self._fpr, self._tpr, self._threshold = roc_curve(self._y_test, pd.DataFrame(self._pred_prob)[1])
            except AttributeError as ae:
                # Estimator has no predict_proba; keep the label-based curve.
                pass
            self._accuracy = accuracy_score(self._y_test, self._pred)
            self._roc = roc_auc_score(self._y_test, self._pred)

            return self._grid

        def show_performance(self):
            """Print ROC/AUC and the per-class precision/recall report."""
            print(f"ROC/AUC: {round(self._roc*100, 2)}%")
            print()
            # NOTE(review): the target names look copied from a medical
            # project; for this URL-availability task something like
            # ["Failure", "Success"] would read better — confirm before
            # changing, since downstream reports quote these labels.
            print(classification_report(self._y_test, self._pred, target_names=["Disease","Health"]))
In [23]:
import matplotlib


class Visualizer():
    """Static plotting helpers: ROC curves, loss/accuracy panels and
    feature-importance charts. All methods draw via the active pyplot
    state and finish with display() for notebook rendering."""

    @staticmethod
    def group_plot_roc_curve(title, data_group):
        """Plot several ROC curves on one axis.

        Args:
            title: figure title.
            data_group: iterable of (fpr, tpr, label) tuples.
        """
        plt.clf()
        plt.figure(figsize=(5, 5), dpi=80)

        # Diagonal reference line = random guessing.
        x = [0.0, 1.0]
        plt.plot(x, x, linestyle='dashed', color='red', linewidth=2, label='Naive prediction (Random guess)')
        for idx, group in enumerate(data_group):
            fpr = group[0]
            tpr = group[1]
            label = group[2]
            # Alternate solid/dashed so overlapping curves stay readable.
            linestyle = 'solid'
            if idx % 2 == 1:
                linestyle = 'dashed'
            plt.plot(fpr, tpr, linestyle=linestyle, linewidth=2, label=label)

        plt.xlim(0.0, 1.0)
        plt.ylim(0.0, 1.0)
        plt.xlabel("FPR", fontsize=14)
        plt.ylabel("TPR", fontsize=14)

        plt.legend(fontsize=10, loc='best')

        plt.title(title, fontsize=14)
        plt.tight_layout()

        display()

    @staticmethod
    def plot_performance(data,
                            legend_type_name,
                            x_axis_name,
                            upper_y_label,
                            lower_y_label,
                            title):
        """Two stacked panels: one line per unique value of
        data[legend_type_name], plotting upper_y_label (top) and
        lower_y_label (bottom) against x_axis_name.
        """
        plt.clf()
        f, ax = plt.subplots(2, 1, figsize=(15,8))
        legends = data[legend_type_name].unique()
        for idx, legend in enumerate(legends):
            _data = data[data[legend_type_name]==legend]
            ax[0].plot(_data[x_axis_name], _data[upper_y_label], linewidth=2, label=f'{legend_type_name}: {legend}')
            ax[0].set_xlabel(x_axis_name, fontsize=15)
            ax[0].set_ylabel(upper_y_label.upper(), fontsize=15)
            ax[0].legend(fontsize=10, loc='upper right')

            ax[1].plot(_data[x_axis_name], _data[lower_y_label], linewidth=2, label=f'{legend_type_name}: {legend}')
            ax[1].set_xlabel(x_axis_name, fontsize=15)
            ax[1].set_ylabel(lower_y_label.upper(), fontsize=15)
            ax[1].legend(fontsize=10, loc='lower right')

        ax[0].set_title(f"Performance Evaluation of {title}", fontsize=24)
        plt.tight_layout()

        display()

    @staticmethod
    def plot_feature_importance(reg_coef, col_names, title):
        """Horizontal bar chart of coefficients, sorted ascending."""
        reg_coef = pd.Series(reg_coef, index=col_names)
        reg_coef = reg_coef.sort_values()
        matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
        reg_coef.plot(kind="barh",)
        plt.title(title, fontsize=15)

    @staticmethod
    def plot_importance_trending(X_train, feature_importance_matrix, title):
        """Plot each feature's mean coefficient across values of the
        regularization parameter C.

        Assumes feature_importance_matrix has a 'C' column plus one
        column per feature of X_train.
        """
        feature_importance = feature_importance_matrix.groupby('C').agg(['mean'])[[*X_train.columns]]
        feature_importance.columns = X_train.columns.tolist()
        feature_importance['C'] = feature_importance.index

        column_names = X_train.columns
        lbds = feature_importance['C'].tolist()
        coef_matrix = feature_importance[X_train.columns]
        x_lab = 'Lambda'
        y_lab = 'Weight'
        plt.clf()
        plt.figure(figsize=(15, 10))
        for idx, col_name in enumerate(column_names):
            plt.plot(lbds, coef_matrix.iloc[:,idx], 'o-', linewidth=2, label=col_name)
            # Annotate near the 4th grid point so labels spread out.
            plt.annotate(col_name, (lbds[3], coef_matrix.iloc[3,idx]))

        # Fix: 'fontSize' (capital S) is not a valid matplotlib Text
        # property and raises AttributeError on recent matplotlib.
        plt.title(title, fontsize=25)
        plt.xlabel(x_lab)
        plt.ylabel(y_lab)

        plt.legend(loc='upper right')
        plt.tight_layout()
        display()
In [24]:
from concurrent.futures.thread import ThreadPoolExecutor
import warnings
# NOTE(review): blanket suppression hides convergence warnings from the
# saga solver below — consider narrowing to specific warning categories.
warnings.filterwarnings("ignore")
# NOTE(review): duplicate import — ThreadPoolExecutor is already imported above.
from concurrent.futures.thread import ThreadPoolExecutor
from sklearn.metrics import hinge_loss

def loss_accuracy_analyze_job_builder(X_train, y_train, X_test, y_test, model_func, param):
    """Build a zero-argument job that trains one hyper-parameter combination.

    Args:
        X_train, y_train, X_test, y_test: train/test split.
        model_func: estimator instance to grid-search over.
        param: dict mapping parameter name -> single-element list
            (one concrete combination wrapped for GridSearchCV).

    Returns:
        A callable producing a dict with accuracy (percent), hinge loss,
        AUC, per-feature coefficients and the scalar parameter values.
    """
    def _analyze_param_combination():
        engine = AnalysisEngineBuilder() \
                    .set_X_train(X_train) \
                    .set_y_train(y_train) \
                    .set_X_test(X_test) \
                    .set_y_test(y_test) \
                    .set_param_grid(param) \
                    .set_engine(model_func) \
                    .build()
        model = engine.analyze()

        # Performance scores on the held-out set.
        loss = hinge_loss(y_test, pd.DataFrame(model.predict_proba(X_test))[1])
        auc = roc_auc_score(y_test, model.predict(X_test))

        # Per-feature coefficients of the best estimator.
        coef = pd.Series(model.best_estimator_.coef_[0], index=X_test.columns).to_dict()
        # Fix: build a fresh dict of scalar values. The old code aliased
        # `param` (_param = param) and then wrote into it, mutating the
        # caller's dictionary in place.
        _param = {key: value[0] for key, value in param.items()}
        return {
            'accuracy': engine.accuracy * 100,
            'loss': loss,
            'auc': auc,
            **coef,
            **_param
        }
    return _analyze_param_combination

# Refactor into the analyzer later on
def calculate_grid_performance(X_train, y_train, X_test, y_test, params, model):
    """Train one model per hyper-parameter combination (in threads) and
    return a DataFrame of score rows, one per combination."""
    # Cartesian product of all parameter values, built by repeatedly
    # merging on a constant 'dummy' key (cross join).
    grid = pd.DataFrame({'dummy': [1]})
    for name, values in params.items():
        grid = pd.merge(grid, pd.DataFrame({name: values, 'dummy': [1] * len(values)}))
    grid.drop('dummy', axis=1, inplace=True)

    # Fit every combination on its own worker thread and collect results.
    with ThreadPoolExecutor(max_workers=allocated_cpu) as executor:
        futures = [
            executor.submit(
                loss_accuracy_analyze_job_builder(
                    X_train, y_train, X_test, y_test, model,
                    {name: [value] for name, value in record.items()}))
            for record in grid.to_dict('records')
        ]
        return pd.DataFrame.from_dict([future.result() for future in futures])
In [25]:
from sklearn.linear_model import LogisticRegression


# Feature-engineering pipeline: derive URL-based features, encode the
# protocol as a frequency, drop near-constant columns, then L2-normalize.
pipe = Pipeline([
    ('url_length_counter', URLLengthCounter()),
    ('url_depth_counter', URLDepthCounter()),
    ('has_www_converter', HasWWWConverter()),
    ('subdomain_level_counter', SubdomainLevelCounter()),
    ('request_parameter_counter', RequestParameterCounter()),
    ('domain_suffix_builder', DomainSuffixBuilder()),
    ('incorrect_domain_url_cleaner', IncorrectDomainUrlCleaner()),
    ('column_renamer', ColumnRenamer({'scheme': 'protocol_type'})),
#     ('timeseries_converter', TimeseriesConverter()),
    ('feature_picker', FeaturePicker(['protocol_type',
                                        'url_depth',
                                        'has_www',
                                        'subdomain_level',
                                        'param_cnt',
                                        'suffix',
                                        'timestamp_coef',
                                        'is_port_access',
                                        'status'
                                       ])),
    ('frequency_indexer', categorical_encoders.CountFrequencyCategoricalEncoder(
        encoding_method='frequency',
        variables=['protocol_type'])),
    ('low_variance_remover', LowVarianceRemover(.005)), # Decreased to .005
    ('standard_scaler', CustomizedStandardizer(norm='l2')),

])

# NOTE(review): fit_transform appears to return an array whose last
# column is the 'status' target — it is split off below. Confirm the
# final pipeline step preserves column order.
X = pipe.fit_transform(df)

X_train, X_test = train_test_split(X, test_size=0.3, random_state=seed)

# Separate target (last column) from features; column names come from
# the final pipeline step.
y_train = X_train[:,-1]
X_train = pd.DataFrame(X_train, columns= pipe.steps[-1][1].columns)
X_train = X_train.drop('status', axis=1)
print(X_train.columns)



y_test = X_test[:,-1]
X_test = pd.DataFrame(X_test, columns= pipe.steps[-1][1].columns)
X_test = X_test.drop('status', axis=1)
print(X_test.columns)

start_time = time.time()
# Single-point grid for l1_ratio/C (np.logspace(-3, 0, 1) yields one
# value); the search effectively sweeps max_iter only.
param_lr = {
    'l1_ratio': [0, *np.logspace(-3, 0, 1)],
    'C': np.logspace(-3, 0, 1),
    'max_iter': np.arange(10,80,1),
}

# Elastic-net logistic regression; saga is the only solver supporting
# the elasticnet penalty.
lr = LogisticRegression(random_state=seed,
                       penalty='elasticnet',
                       solver='saga',
                       multi_class='ovr',
                       warm_start=False,
                       n_jobs=allocated_cpu,
)


# Start to train model
engine_lr = AnalysisEngineBuilder() \
    .set_X_train(X_train) \
    .set_y_train(y_train) \
    .set_X_test(X_test) \
    .set_y_test(y_test) \
    .set_param_grid(param_lr) \
    .set_engine(lr) \
    .build()

model_lr = engine_lr.analyze()
engine_lr.show_performance()

# Report wall-clock training time as minutes and seconds.
t = str(datetime.timedelta(seconds=time.time() - start_time)).split(':')
print("--- %s minutes, %.2f seconds ---" % (t[1], float(t[2])))
Index(['protocol_type', 'url_depth', 'has_www', 'subdomain_level', 'suffix'], dtype='object')
Index(['protocol_type', 'url_depth', 'has_www', 'subdomain_level', 'suffix'], dtype='object')
ROC/AUC: 50.0%

              precision    recall  f1-score   support

     Disease       0.00      0.00      0.00        13
      Health       0.99      1.00      0.99      1184

    accuracy                           0.99      1197
   macro avg       0.49      0.50      0.50      1197
weighted avg       0.98      0.99      0.98      1197

--- 00 minutes, 16.88 seconds ---
In [26]:
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

# ROC curve of the tuned logistic-regression model on the test set.
Visualizer.group_plot_roc_curve('ROC Curve of Logistic Regression', [
    (engine_lr.fpr, engine_lr.tpr, 'Logistic Regression')
])
In [27]:
# Wider sweep than the training cell above: 5 values each for l1_ratio
# and C, two max_iter settings; one model is trained per combination.
param_lr = {
    'l1_ratio': [0, *np.logspace(-3, 0, 5)],
    'C': np.logspace(-3, 0, 5),
    'max_iter': np.arange(10,80,40),
}

lr = LogisticRegression(random_state=seed,
                        penalty='elasticnet',
                        solver='saga',
                        multi_class='ovr',
                        warm_start=False,
                        n_jobs=allocated_cpu,
)

# One row per parameter combination: accuracy, hinge loss, AUC and
# per-feature coefficients.
loss_accuracy_matrix = calculate_grid_performance(X_train, y_train, X_test, y_test, param_lr, lr)
In [28]:
# Loss (upper panel) and accuracy (lower panel) versus C, one line per
# l1_ratio value.
Visualizer.plot_performance(data=loss_accuracy_matrix,
                    legend_type_name='l1_ratio',
                    x_axis_name='C',
                    upper_y_label='loss',
                    lower_y_label='accuracy',
                    title='Loss & Accuracy - Logistic Regression'  # fix: was 'Loss& Accuracy' (missing space in chart title)
)
<Figure size 432x288 with 0 Axes>
In [29]:
Visualizer.plot_importance_trending(X_train, loss_accuracy_matrix, 'Weight change on each feature')
<Figure size 432x288 with 0 Axes>
In [ ]: